Concordance (Word Count)

Find the ten most commonly used words in a text file.

# Load the whole book into one string. The 'utf-8-sig' codec decodes UTF-8
# and drops a leading byte-order mark if the file starts with one.
with open('paradise-lost.txt', encoding='utf-8-sig') as f:
    text = f.read()
# print(text[:50])               # 'The Project Gutenberg EBook of Paradise Lost, by J'

Method #1

def concordance(text):
    """Count how often each whitespace-separated token occurs in *text*.

    Tokens are taken exactly as ``str.split()`` yields them, so the count
    is case-sensitive and punctuation attached to a token is kept.

    Returns a plain dict mapping token -> number of occurrences, with keys
    in order of first appearance.
    """
    counts = {}
    for token in text.split():
        if token in counts:
            counts[token] += 1
        else:
            # First sighting of this token.
            counts[token] = 1
    return counts

# Show the raw counts: tokens are case-sensitive and keep any attached
# punctuation (e.g. 'The', '2007]', 'Language:').
print(concordance(text))

Output:

{'The': 475,
 'Project': 84,
 'Gutenberg': 25,
 . . .
 '2007]': 1,
 'Language:': 1,
 'English': 1,
 'Character': 1,
 'set': 36,
 'encoding:': 1,
 'ASCII': 1,
 '***': 6,
 . . .

Modifications: keep only word characters and convert everything to lowercase

from re import sub

def concordance(text):
    """Count word frequencies in *text*, ignoring case and punctuation.

    Each whitespace-separated token is lowercased and stripped of every
    character that is not a regex word character (``\\w``: letters, digits,
    underscore). Tokens that consist only of such characters (for example
    '***') become empty and are skipped, so '' is never counted as a word.

    Returns a plain dict mapping word -> number of occurrences.
    """
    freq = {}
    for token in text.split():
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        word = sub(r'[^\w]', '', token.lower())   # not words -> empty string
        if not word:
            continue                              # nothing left but punctuation
        if word not in freq:
            freq[word] = 0
        freq[word] += 1
    return freq

freq = concordance(text)
# Rank the (word, count) pairs by count, highest first, and keep the top ten.
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10])

Output:

[('and', 3483),
 ('the', 3162),
 ('to', 2326),
 ('of', 2186),
 ('in', 1430),
 ('with', 1208),
 ('his', 1181),
 ('or', 795),
 ('that', 720),
 ('all', 712)]

Method #2 (initialize and assign in one step)

from re import sub

def concordance(text):
    """Count word frequencies in *text*, ignoring case and punctuation.

    Same result as the explicit "if word not in freq" version, but the
    counter is initialised and incremented in a single statement via
    ``dict.get``. Tokens that contain no word characters (for example
    '***') are skipped, so '' is never counted as a word.

    Returns a plain dict mapping word -> number of occurrences.
    """
    freq = {}
    for token in text.split():
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        word = sub(r'[^\w]', '', token.lower())
        if not word:
            continue                              # nothing left but punctuation
        # get() supplies 0 for an unseen word; freq.setdefault(word, 0) + 1
        # would work just as well on the right-hand side.
        freq[word] = freq.get(word, 0) + 1
        # freq.setdefault(word, 0) += 1 does not work: a function call
        # cannot be the target of an assignment.
    return freq

freq = concordance(text)
# Rank the (word, count) pairs by count, highest first, and keep the top ten.
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10])

Method #3 (defaultdict)

Template:

from collections import defaultdict
x = defaultdict(int)
x['asdfasdf']                              # reading a missing key inserts it with int() == 0
x[1]                                       # same for a non-string key
print(x)                                   # defaultdict(<class 'int'>, {'asdfasdf': 0, 1: 0})

Code:

from re import sub
from collections import defaultdict

def concordance(text):
    """Count word frequencies in *text*, ignoring case and punctuation.

    Uses ``defaultdict(int)`` so an unseen word starts at 0 automatically
    and no explicit initialisation is needed. Tokens that contain no word
    characters (for example '***') are skipped, so '' is never counted.

    Returns a defaultdict(int) mapping word -> number of occurrences.
    """
    freq = defaultdict(int)                # <-- missing keys default to 0
    for token in text.split():
        # Raw string: '\w' in a normal literal is an invalid escape sequence.
        word = sub(r'[^\w]', '', token.lower())
        if word:                           # skip tokens that were pure punctuation
            freq[word] += 1                # <-- no "if word not in freq" needed
    return freq

freq = concordance(text)
# Rank the (word, count) pairs by count, highest first, and keep the top ten.
print(sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10])

Method #4 (Counter)

Template:

from collections import Counter
print(Counter('aabbbc'))         # Counter({'b': 3, 'a': 2, 'c': 1}) - the repr lists the most common first

Code:

from re import sub
from collections import Counter

def concordance(text):
    """Count word frequencies in *text*, ignoring case and punctuation.

    Each whitespace-separated token is lowercased and stripped of every
    non-word character; ``Counter`` then tallies what is left. Tokens that
    contain no word characters (for example '***') are dropped, so '' is
    never counted as a word.

    Returns a collections.Counter mapping word -> number of occurrences.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    cleaned = (sub(r'[^\w]', '', token.lower()) for token in text.split())
    return Counter(word for word in cleaned if word)

freq = concordance(text)
# Counter already knows how to rank its entries: most_common(10) returns the
# ten (word, count) pairs with the highest counts, highest first.
print(freq.most_common(10))

Output:

[('and', 3483),
 ('the', 3162),
 ('to', 2326),
 ('of', 2186),
 ('in', 1430),
 ('with', 1208),
 ('his', 1181),
 ('or', 795),
 ('that', 720),
 ('all', 712)]

Table of Contents

Previous topic

Next topic

Concordance (Word Count)

Method #1

Modifications: only words and all in lowercase

Method #2 (initialize and assign in one step)

Method #3 (defaultdict)

Method #4 (Counter)